In [113]:
import pickle
import gensim
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA

import constants, transactions

%matplotlib inline

In [3]:
tle = transactions.TransLogExtractor(constants.RAW_DATA_DIR, constants.FEAT_DATA_DIR)

In [4]:
prior_orders = tle.get_orders_items('prior')

In [6]:
train_orders = tle.get_orders_items('train')

In [30]:
products = tle.get_items('products')

In [7]:
prior_orders['product_id'] = prior_orders['product_id'].astype(str)

In [8]:
train_orders['product_id'] = train_orders['product_id'].astype(str)

In [11]:
prior_products = prior_orders.groupby('order_id').apply(lambda grp: grp['product_id'].tolist())

In [12]:
train_products = train_orders.groupby('order_id').apply(lambda grp: grp['product_id'].tolist())

In [13]:
sentences = prior_products.append(train_products)  # one Series of all orders; use pd.concat in newer pandas

In [14]:
longest = np.max(sentences.apply(len))

In [17]:
sentences = sentences.values

gensim word2vec

  • Input text
    • Each row is one sentence (a list of str)
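
Since window is set to the longest basket below, every product in an order serves as context for every other product in the same order. For illustration only, a minimal sketch of the expected input format (the product ids here are placeholders):

In [ ]:
# Each "sentence" is one order; each "word" is a product_id encoded as a string.
toy_sentences = [
    ['196', '12427', '10258'],  # products in one order
    ['196', '10258'],           # products in another order
]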

In [19]:
%%time
model = gensim.models.Word2Vec(sentences, size=100, window=longest, min_count=2, workers=4)


CPU times: user 7min 40s, sys: 3.91 s, total: 7min 44s
Wall time: 2min 19s
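
As a quick sanity check on the freshly trained model (gensim < 4.0 API, matching the size=/syn0 calls used throughout this notebook):

In [ ]:
# index2word is sorted by descending frequency, so [0] is the most common product;
# its nearest neighbours in embedding space should be plausible co-purchases.
top_product = model.wv.index2word[0]
model.wv.most_similar(positive=[top_product], topn=5)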

In [20]:
vocab = list(model.wv.vocab.keys())

In [21]:
pca = PCA(n_components=2)
pca.fit(model.wv.syn0)


Out[21]:
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [28]:
def get_batch(vocab, model, n_batches=3):
    output = list()
    for _ in range(0, n_batches):
        rand_int = np.random.randint(len(vocab), size=1)[0]  # pick a random word from the vocabulary
        suggestions = model.wv.most_similar(positive=[vocab[rand_int]], topn=5)  # its 5 most similar words under the word2vec model
        suggest = list()
        for word, _score in suggestions:
            suggest.append(word)
        output += suggest  # the similar words
        output.append(vocab[rand_int])  # followed by the centre word itself
    return output

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """From Tensorflow's tutorial."""
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i,:]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
#     plt.savefig(filename)
    plt.show()

In [41]:
embeds = []
labels = []
for item in get_batch(vocab, model, n_batches=1):
    embeds.append(model.wv[item])  # the product's 100-d embedding
    labels.append(products.loc[int(item)]['product_name'])
embeds = np.array(embeds)
embeds = pca.fit_transform(embeds)  # note: refits PCA on this small batch, discarding the earlier fit on syn0
plot_with_labels(embeds, labels)



In [44]:
model.save(constants.W2V_DIR + "product2vec.model")
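
The saved model can be restored later without retraining:

In [ ]:
# Word2Vec.load is the counterpart of model.save
model = gensim.models.Word2Vec.load(constants.W2V_DIR + "product2vec.model")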

In [48]:
model.score??

In [74]:
pca = PCA(n_components=25)
pca.fit(model.wv.syn0)


Out[74]:
PCA(copy=True, iterated_power='auto', n_components=25, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
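
Before committing to the compressed features, it is worth checking how much of the embedding variance the 25 components retain:

In [ ]:
# fraction of total variance kept by the 25 principal components
pca.explained_variance_ratio_.sum()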

In [100]:
compressed = pd.DataFrame(pca.transform(model.wv[vocab]), columns=['w2v_dim_%d' % i for i in range(25)])

In [102]:
compressed['product_id'] = vocab

In [103]:
compressed['product_id'] = compressed['product_id'].astype(int)

In [106]:
compressed = pd.merge(products[['product_id']], compressed, on=['product_id'], how='left')

In [110]:
for col in compressed.columns:
    # products filtered out by min_count=2 have no embedding; impute with the column mean
    compressed[col] = compressed[col].fillna(compressed[col].mean())

In [114]:
with open(constants.FEAT_DATA_DIR + 'p_w2v_feat.pkl', 'wb') as f:
    pickle.dump(compressed, f, pickle.HIGHEST_PROTOCOL)
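
A minimal sketch of how a downstream step would read the feature file back (the variable name p_w2v_feat is illustrative):

In [ ]:
with open(constants.FEAT_DATA_DIR + 'p_w2v_feat.pkl', 'rb') as f:
    p_w2v_feat = pickle.load(f)  # one row per product_id plus 25 w2v_dim_* columns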

In [ ]: